In [ ]:
# Raw survey responses for five managers, one vector per column.
# The vectors remain in the global environment after the frame is built.
manager <- c(1, 2, 3, 4, 5)
date    <- c("10/24/08", "10/28/08", "10/1/08", "10/12/08", "5/1/09")
country <- c("US", "US", "UK", "UK", "UK")
gender  <- c("M", "F", "F", "M", "F")
age     <- c(32, 45, 25, 39, 99)
q1      <- c(5, 3, 3, 3, 2)
q2      <- c(4, 5, 5, 3, 2)
q3      <- c(5, 2, 5, 4, 1)
q4      <- c(5, 5, 5, NA, 2)
q5      <- c(5, 5, 2, NA, 1)

# Assemble the data frame with explicit column names; text columns stay
# character rather than being converted to factors.
leadership <- data.frame(
  manager = manager,
  date = date,
  country = country,
  gender = gender,
  age = age,
  q1 = q1,
  q2 = q2,
  q3 = q3,
  q4 = q4,
  q5 = q5,
  stringsAsFactors = FALSE
)
In [ ]:
# Display the data frame.
leadership
In [ ]:
# Small demo frame for transform(), the preferred way to add derived columns.
mydata <- data.frame(
  x1 = c(2, 2, 6, 4),
  x2 = c(3, 4, 2, 8)
)
mydata
In [ ]:
# Append the row-wise sum and mean of x1 and x2 in a single transform() call.
mydata <- transform(
  mydata,
  sumx  = x1 + x2,
  meanx = (x1 + x2) / 2
)
mydata
In [ ]:
# Recode the placeholder age 99 as missing. %in% never yields NA, so the
# mask stays strictly TRUE/FALSE even when some ages are already NA.
leadership$age <- replace(leadership$age, leadership$age %in% 99, NA)
In [ ]:
# Display the data frame to inspect the age column.
leadership
In [ ]:
# Derive a character age bracket from age:
#   above 75 -> "Elder", 55 to 75 -> "Middle Aged", below 55 -> "Young".
# Rows whose age is NA keep an NA bracket. local() keeps the helper
# variables out of the global environment.
leadership$agecat <- local({
  yrs <- leadership$age
  bracket <- rep(NA_character_, length(yrs))
  bracket[which(yrs > 75)] <- "Elder"
  bracket[which(yrs >= 55 & yrs <= 75)] <- "Middle Aged"
  bracket[which(yrs < 55)] <- "Young"
  bracket
})
leadership
In [ ]:
# Can use plyr rename() function
# rename(dataframe, c(oldname="newname", oldname="newname",...))
In [ ]:
# List the column names of the data frame.
names(leadership)
In [ ]:
# Rename the "date" column to "testDate". Selecting the column by name rather
# than by position keeps the rename correct if the column order ever changes;
# when no column is called "date" the assignment does nothing.
names(leadership)[names(leadership) == "date"] <- "testDate"
In [ ]:
# Display the data frame to inspect the column names.
leadership
In [ ]:
# NA is not comparable, even to itself; test for it with is.na()
# is.nan() and is.infinite() test for not-a-number and infinite values
# some functions, e.g. sum(), accept na.rm = TRUE to ignore NAs
# use na.omit(dataframe) to drop every row that contains an NA
In [ ]:
# Element-wise missing-value test on a vector whose last element is NA.
y <- c(1, 2, 3, NA)
is.na(y)
In [ ]:
# is.nan() tests for NaN specifically; a plain NA is not NaN.
is.nan(y)
In [ ]:
# Missing-value mask for columns 6 through 10 of leadership (selected by
# position, so this assumes the column order has not changed).
is.na(leadership[,6:10])
In [ ]:
# sin(Inf) is undefined, so R returns NaN (with a warning).
is.nan(sin(Inf))
In [ ]:
is.infinite(Inf)
In [ ]:
# NaN is not infinite.
is.infinite(sin(Inf))
In [ ]:
# NA propagates through arithmetic: the plain sum of a vector holding NA is NA.
x <- c(1, 2, NA, 3)
sum(x)
In [ ]:
# na.rm = TRUE drops the missing values before summing.
sum(x, na.rm = TRUE)
In [ ]:
# Display the data frame before dropping incomplete rows.
leadership
In [ ]:
# na.omit() removes every row that has an NA in any column.
newdata <- na.omit(leadership)
newdata
In [ ]:
# Useful functions
# as.Date(str, format)
# Sys.Date()
# date()
# format(Dateobject, format string)
# help(as.Date), help(strftime)
# package: lubridate, timeDate
In [ ]:
# Strings already in the default yyyy-mm-dd layout need no format argument.
mydates <- as.Date(c("2007-06-22", "2004-02-13"))
In [ ]:
mydates
In [ ]:
# Parse month/day/four-digit-year strings; the layout is passed explicitly
# because it differs from as.Date()'s default.
strDates <- c("01/05/1965", "08/16/1975")
dates <- as.Date(strDates, format = "%m/%d/%Y")
In [ ]:
# Display the parsed Date values.
dates
In [ ]:
# Convert the survey dates from month/day/two-digit-year text to Date objects.
# The pattern is stored in `date_format` rather than in a variable called
# `format`, so the base format() function is not shadowed by a character
# string in the global environment.
date_format <- "%m/%d/%y"
leadership$testDate <- as.Date(leadership$testDate, format = date_format)
leadership
In [ ]:
# Current date as a Date object.
Sys.Date()
In [ ]:
# Current date and time as a character string.
date()
In [ ]:
today <- Sys.Date()
# %B = full month name, %d = day of the month, %Y = four-digit year.
format(today, format = "%B %d %Y")
In [ ]:
# %A = full weekday name.
format(today, format = "%A")
In [ ]:
# Elapsed time between two dates: subtracting one Date from another yields a
# difftime measured in days.
startDate <- as.Date("1985-07-29")
endDate <- as.Date("2017-01-13")
days <- endDate - startDate
days
In [ ]:
# The same interval expressed in weeks. difftime(time1, time2) computes
# time1 - time2, so the later date is passed first to get a positive span.
# The unit is spelled out in full ("weeks") rather than relying on partial
# matching of "week".
difftime(endDate, startDate, units = "weeks")
In [ ]:
# [is|as].[numeric|character|vector|matrix|data.frame|factor|logical]
In [ ]:
# A numeric vector to probe with the is.*() predicates.
a <- c(1, 2, 3)
is.numeric(a)
In [ ]:
is.vector(a)
In [ ]:
# as.character() converts each number to its string form; `a` is overwritten.
a <- as.character(a)
a
In [ ]:
# Re-run the predicates on the converted value: it is no longer numeric,
# is still a vector, and is now character.
print(is.numeric(a))
print(is.vector(a))
print(is.character(a))
In [ ]:
# Sort by gender, then by ascending age within each gender.
# The columns are referenced through the data frame instead of attach():
# an attached frame sits behind the global environment on the search path,
# so the global `gender` and `age` vectors created when the data was built
# would be used for the ordering rather than the current column values
# (for example, the age that has since been recoded to NA).
newdata <- leadership[order(leadership$gender, leadership$age), ]
newdata
In [ ]:
# Sort by gender, then by descending age within each gender.
# The columns are referenced through the data frame instead of attach():
# an attached frame sits behind the global environment on the search path,
# so the global `age` vector (which still holds the raw value 99) would
# drive the ordering and push that row to the top of its gender group.
# Using the column itself orders on the recoded ages, with NA ages handled
# by order()'s default na.last = TRUE.
newdata <- leadership[order(leadership$gender, -leadership$age), ]
newdata
In [ ]:
# merge: total <- merge(dataframeA, dataframeB, by=c("ID","Country"))
# cbind: total <- cbind(A, B)
# rbind: total <- rbind(dataframeA, dataframeB)
In [ ]:
# select column
# select observation
# subset() function
# sample() function to random sample, sampling and survey package
In [ ]:
# Select columns 6 through 10 by position.
newdata <- leadership[, 6:10]
newdata
In [ ]:
# Select the five question columns by name: "q1" through "q5".
myvars <- paste0("q", 1:5)
newdata <- leadership[, myvars]
newdata
In [ ]:
# Logical mask over the column names: TRUE where the column is q3 or q4.
myvars <- names(leadership) %in% c("q3", "q4")
myvars
In [ ]:
# Negating the mask keeps every column except q3 and q4.
newdata <- leadership[!myvars]
newdata
In [ ]:
# Drop columns 8 and 9 by position; negative indices exclude columns.
newdata <- leadership[, -(8:9)]
newdata
In [ ]:
# Make sure testDate is a Date, then define the bounds of the reporting
# window used for row selection.
leadership$testDate <- as.Date(leadership$testDate, format = "%m/%d/%y")
enddate <- as.Date("2009-10-31")
startdate <- as.Date("2009-01-01", format = "%Y-%m-%d")
print(startdate)
In [ ]:
# Display the data frame before filtering by date.
leadership
In [ ]:
# Keep the rows whose test date falls inside the window from startdate to
# enddate (both inclusive). The upper bound is applied as well as the lower
# one, so the enddate defined alongside startdate actually takes effect.
leadership[leadership$testDate >= startdate &
             leadership$testDate <= enddate, ]
In [ ]:
# Rows with age above 24 and at most 35, keeping the columns from age
# through q5. Inside subset() the bare names refer to the data frame's columns.
newdata <- subset(leadership, age > 24 & age <= 35, select = age:q5)
newdata
In [ ]:
# Rows where gender is "M" and age is at most 35, keeping the columns from
# country through q5.
newdata <- subset(leadership, gender == "M" & age <= 35, select = country:q5)
newdata
In [ ]:
# Draw three distinct rows at random (sampling without replacement).
# sample.int(n, size) samples from 1..n directly, avoiding 1:nrow(), which
# expands to c(1, 0) for an empty data frame and would then index a row that
# does not exist.
newdata <- leadership[sample.int(nrow(leadership), 3, replace = FALSE), ]
newdata